#include "opencl_source_map.hpp" 
// Embedded OpenCL C program text for the buffer-based Winograd transform
// kernels (weight, source and destination transforms). The adjacent string
// literals below are concatenated by the C++ compiler into one C string; each
// literal ends with a newline escape so the OpenCL compiler sees one source
// line per literal.
namespace MNN { 
// The whole string is compiled out when MNN_OPENCL_BUFFER_CLOSED is defined.
#ifndef MNN_OPENCL_BUFFER_CLOSED
const char* winogradTransform_buf = 
// Enable the cl_khr_fp16 extension only when the OpenCL program is built with
// MNN_SUPPORT_FP16 defined.
// NOTE(review): FLOAT, FLOAT4, FLOAT8 and CONVERT_FLOAT8 are used by the
// kernels but are not defined anywhere in this string; presumably they are
// injected through the program build options - confirm against the host code.
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
// GLOBAL_SIZE_DIM2 expands to the two leading kernel parameters
// (global_size_dim0, global_size_dim1) and ends with a trailing comma, so it
// must be followed by further parameters in every kernel signature.
"#define GLOBAL_SIZE_DIM2 "" __private int global_size_dim0,__private int global_size_dim1,\n"
// UNIFORM_BOUNDRY_CHECK makes a work-item return immediately when either of
// its indices is at or beyond the corresponding global_size_dim value.
"#define UNIFORM_BOUNDRY_CHECK(index0, index1) "" if(index0 >= global_size_dim0 || index1 >= global_size_dim1) { "" return; "" }\n"
// Kernel winoTransWeightBuf2_3_1
// One work-item per (src_c, dst_c) pair, taken from global ids 0 and 1.
// It reads the 9 values of one 3x3 filter from input and writes 16 values to
// output, one into each of 16 planes of size srcChannelPad*dstChannelPad.
// Work-items that fall in the channel padding write zeros instead.
"// [dstChannel,srcChannel,3,3] -> [4x4,srcChannelPad,dstChannelpad] (N,Kpad,Npad)\n"
"__kernel void winoTransWeightBuf2_3_1(GLOBAL_SIZE_DIM2\n"
// input is declared as 32-bit float regardless of FLOAT; output uses FLOAT.
" __global const float* input,// 0\n"
" __global FLOAT* output,\n"
" __private const int srcChannel,// 3\n"
" __private const int dstChannel,\n"
" __private const int srcChannelPad,// 6\n"
" __private const int dstChannelPad\n"
") {\n"
" int2 pos=(int2)(get_global_id(0),get_global_id(1));\n"
" UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n"
" \n"
" const int src_c=pos.x;\n"
" const int dst_c=pos.y;\n"
" \n"
// out_offset addresses element (src_c, dst_c) of plane 0; out_offset_add is
// the stride between two consecutive planes.
" const int out_offset=(0*srcChannelPad+src_c)*dstChannelPad+dst_c;\n"
" const int out_offset_add=srcChannelPad*dstChannelPad;\n"
// Padding region: indices inside the padded sizes but outside the real channel
// counts get zero in all 16 planes, then the work-item exits.
" if(src_c >= srcChannel || dst_c >= dstChannel) {\n"
" for(int i=0; i<16; i++) {\n"
" output[out_offset+i*out_offset_add]=(FLOAT)0;\n"
" }\n"
" return;\n"
" }\n"
" \n"
// The 9 filter values are contiguous at (dst_c*srcChannel+src_c)*9: the first
// eight come from one vload8, the ninth from a scalar read.
" const int in_offset=(dst_c*srcChannel+src_c)*9;\n"
" FLOAT8 in=CONVERT_FLOAT8(vload8(0,input+in_offset));\n"
" FLOAT in8=input[in_offset+8];\n"
" \n"
// First pass over the filter rows (row r = elements 3r..3r+2):
//   GB_0x = row0, GB_1x = row0+row1+row2, GB_2x = row0-row1+row2, GB_3x = row2.
// NOTE(review): the textbook Winograd F(2x2,3x3) weight matrix uses 0.5 in the
// two middle rows; here the coefficients are 1. Presumably the 0.5 factors in
// winoTransSrcBuf2_3_1 compensate for this - confirm before changing either.
" FLOAT GB_00=in.s0;\n"
" FLOAT GB_01=in.s1;\n"
" FLOAT GB_02=in.s2;\n"
" FLOAT GB_10=in.s0+in.s3+in.s6;\n"
" FLOAT GB_11=in.s1+in.s4+in.s7;\n"
" FLOAT GB_12=in.s2+in.s5+in8;\n"
" FLOAT GB_20=in.s0-in.s3+in.s6;\n"
" FLOAT GB_21=in.s1-in.s4+in.s7;\n"
" FLOAT GB_22=in.s2-in.s5+in8;\n"
" FLOAT GB_30=in.s6;\n"
" FLOAT GB_31=in.s7;\n"
" FLOAT GB_32=in8;\n"
" \n"
// Second pass applies the same four combinations along the columns of each GB
// row: (c0, c0+c1+c2, c0-c1+c2, c2), giving the 4x4 result GBGT.
" FLOAT GBGT_00=GB_00;\n"
" FLOAT GBGT_01=GB_00+GB_01+GB_02;\n"
" FLOAT GBGT_02=GB_00-GB_01+GB_02;\n"
" FLOAT GBGT_03=GB_02;\n"
" \n"
" FLOAT GBGT_10=GB_10;\n"
" FLOAT GBGT_11=GB_10+GB_11+GB_12;\n"
" FLOAT GBGT_12=GB_10-GB_11+GB_12;\n"
" FLOAT GBGT_13=GB_12;\n"
" \n"
" FLOAT GBGT_20=GB_20;\n"
" FLOAT GBGT_21=GB_20+GB_21+GB_22;\n"
" FLOAT GBGT_22=GB_20-GB_21+GB_22;\n"
" FLOAT GBGT_23=GB_22;\n"
" \n"
" FLOAT GBGT_30=GB_30;\n"
" FLOAT GBGT_31=GB_30+GB_31+GB_32;\n"
" FLOAT GBGT_32=GB_30-GB_31+GB_32;\n"
" FLOAT GBGT_33=GB_32;\n"
// Store the 4x4 result row-major across the planes: GBGT_rc goes to plane 4*r+c.
" output[out_offset+0*out_offset_add]=GBGT_00;\n"
" output[out_offset+1*out_offset_add]=GBGT_01;\n"
" output[out_offset+2*out_offset_add]=GBGT_02;\n"
" output[out_offset+3*out_offset_add]=GBGT_03;\n"
" output[out_offset+4*out_offset_add]=GBGT_10;\n"
" output[out_offset+5*out_offset_add]=GBGT_11;\n"
" output[out_offset+6*out_offset_add]=GBGT_12;\n"
" output[out_offset+7*out_offset_add]=GBGT_13;\n"
" output[out_offset+8*out_offset_add]=GBGT_20;\n"
" output[out_offset+9*out_offset_add]=GBGT_21;\n"
" output[out_offset+10*out_offset_add]=GBGT_22;\n"
" output[out_offset+11*out_offset_add]=GBGT_23;\n"
" output[out_offset+12*out_offset_add]=GBGT_30;\n"
" output[out_offset+13*out_offset_add]=GBGT_31;\n"
" output[out_offset+14*out_offset_add]=GBGT_32;\n"
" output[out_offset+15*out_offset_add]=GBGT_33;\n"
"}\n"
// Kernel winoTransSrcBuf2_3_1
// One work-item per (tile, channel block): global id 0 is the linear tile
// index in a unitWidth x unitHeight grid, global id 1 is the channel-block
// index (bounded by srcChannelC4). Each work-item loads a 4x4 patch of FLOAT4
// values from uInput (zero-filled outside the image), applies a two-pass
// linear transform, and scatters the 16 FLOAT4 results into 16 planes of
// uOutput.
"__kernel void winoTransSrcBuf2_3_1(GLOBAL_SIZE_DIM2\n"
" __global const FLOAT* uInput,// 0\n"
" __global FLOAT* uOutput,__private const int unitWidth,\n"
" __private const int unitHeight,// 3\n"
" __private const int padX,__private const int padY,\n"
" __private const int srcWidth,// 6\n"
" __private const int srcHeight,__private const int srcChannelC4,\n"
" __private const int dstHeightPad,__private const int srcChannelPad,\n"
" __private const int batch,\n"
" __private const int batchOffset) {\n"
" int2 pos=(int2)(get_global_id(0),get_global_id(1)); \n"
" UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n"
" \n"
// Second guard, expressed in terms of the logical problem size rather than the
// global_size_dim parameters.
" if(pos.x >= unitWidth*unitHeight || pos.y >= srcChannelC4) {\n"
" return;\n"
" }\n"
// Split the linear tile index into (column, row) within the tile grid.
" int unitWidth_idx=pos.x % unitWidth;\n"
" int unitHeight_idx=pos.x/unitWidth;\n"
" int2 realPos=(int2)(unitWidth_idx,unitHeight_idx);\n"
" int dstXOrigin=pos.y;\n"
// The value computed for batchIndex here is never used: it is overwritten with
// batchOffset a few lines below, so the batch is selected by the caller.
" int batchIndex=pos.y/srcChannelC4;\n"
" int srcZ=pos.y % srcChannelC4;\n"
" int dstYOrigin=unitWidth*unitHeight_idx+unitWidth_idx;\n"
" batchIndex=batchOffset;\n"
" {\n"
// Top-left input coordinate of this tile: tiles advance by 2 pixels and are
// shifted by the padding, so the start may be negative.
" int sxStart=(realPos.x)*2-padX;\n"
" int syStart=(realPos.y)*2-padY;\n"
" FLOAT4 S00;\n"
" FLOAT4 S10;\n"
" FLOAT4 S20;\n"
" FLOAT4 S30;\n"
" FLOAT4 S01;\n"
" FLOAT4 S11;\n"
" FLOAT4 S21;\n"
" FLOAT4 S31;\n"
" FLOAT4 S02;\n"
" FLOAT4 S12;\n"
" FLOAT4 S22;\n"
" FLOAT4 S32;\n"
" FLOAT4 S03;\n"
" FLOAT4 S13;\n"
" FLOAT4 S23;\n"
" FLOAT4 S33;\n"
" \n"
// Element offset of the tile origin. The index nests as
// (srcZ, batchIndex, y, x) with 4 values per pixel; it may point outside the
// buffer when the origin is in the padding, which is why every load below is
// guarded.
" int inp_offset=(((batchIndex+srcZ*batch)*srcHeight+syStart)*srcWidth+sxStart)*4;\n"
// Sixteen guarded loads follow, named S<k><r> for column offset k and row
// offset r inside the 4x4 patch. Each one adds 4*k + 4*srcWidth*r elements to
// inp_offset and substitutes a zero vector when (sx, sy) is outside the image.
" {\n"
" int sx=0+sxStart;\n"
" int sy=0+syStart;\n"
" \n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S00=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset);\n"
" }\n"
" {\n"
" int sx=1+sxStart;\n"
" int sy=0+syStart;\n"
" \n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S10=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+4);\n"
" }\n"
" {\n"
" int sx=2+sxStart;\n"
" int sy=0+syStart;\n"
" \n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S20=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+8);\n"
" }\n"
" {\n"
" int sx=3+sxStart;\n"
" int sy=0+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S30=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+12);\n"
" }\n"
" {\n"
" int sx=0+sxStart;\n"
" int sy=1+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S01=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+4*srcWidth);\n"
" }\n"
" {\n"
" int sx=1+sxStart;\n"
" int sy=1+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S11=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+4*srcWidth+4);\n"
" }\n"
" {\n"
" int sx=2+sxStart;\n"
" int sy=1+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S21=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+4*srcWidth+8);\n"
" }\n"
" {\n"
" int sx=3+sxStart;\n"
" int sy=1+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S31=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+4*srcWidth+12);\n"
" }\n"
" {\n"
" int sx=0+sxStart;\n"
" int sy=2+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S02=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+8*srcWidth);\n"
" }\n"
" {\n"
" int sx=1+sxStart;\n"
" int sy=2+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S12=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+8*srcWidth+4);\n"
" }\n"
" {\n"
" int sx=2+sxStart;\n"
" int sy=2+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S22=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+8*srcWidth+8);\n"
" }\n"
" {\n"
" int sx=3+sxStart;\n"
" int sy=2+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S32=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+8*srcWidth+12);\n"
" }\n"
" {\n"
" int sx=0+sxStart;\n"
" int sy=3+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S03=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+12*srcWidth);\n"
" }\n"
" {\n"
" int sx=1+sxStart;\n"
" int sy=3+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S13=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+12*srcWidth+4);\n"
" }\n"
" {\n"
" int sx=2+sxStart;\n"
" int sy=3+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S23=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+12*srcWidth+8);\n"
" }\n"
" {\n"
" int sx=3+sxStart;\n"
" int sy=3+syStart;\n"
" bool outBound=(sx<0 || sx >= srcWidth || sy<0 || sy >= srcHeight);\n"
" S33=outBound ? (FLOAT4)(0) : vload4(0,uInput+inp_offset+12*srcWidth+12);\n"
" }\n"
// First pass, along the row index r of S<k><r> (column k kept fixed):
//   m<k>0 = S<k>0 - S<k>2
//   m<k>1 = 0.5*S<k>1 + 0.5*S<k>2
//   m<k>2 = -0.5*S<k>1 + 0.5*S<k>2
//   m<k>3 = -S<k>1 + S<k>3
" FLOAT4 m00=+S00-S02;\n"
" FLOAT4 m10=+S10-S12;\n"
" FLOAT4 m20=+S20-S22;\n"
" FLOAT4 m30=+S30-S32;\n"
" FLOAT4 m01=+(FLOAT)0.5f*S01+(FLOAT)0.5f*S02;\n"
" FLOAT4 m11=+(FLOAT)0.5f*S11+(FLOAT)0.5f*S12;\n"
" FLOAT4 m21=+(FLOAT)0.5f*S21+(FLOAT)0.5f*S22;\n"
" FLOAT4 m31=+(FLOAT)0.5f*S31+(FLOAT)0.5f*S32;\n"
" FLOAT4 m02=-(FLOAT)0.5f*S01+(FLOAT)0.5f*S02;\n"
" FLOAT4 m12=-(FLOAT)0.5f*S11+(FLOAT)0.5f*S12;\n"
" FLOAT4 m22=-(FLOAT)0.5f*S21+(FLOAT)0.5f*S22;\n"
" FLOAT4 m32=-(FLOAT)0.5f*S31+(FLOAT)0.5f*S32;\n"
" FLOAT4 m03=-S01+S03;\n"
" FLOAT4 m13=-S11+S13;\n"
" FLOAT4 m23=-S21+S23;\n"
" FLOAT4 m33=-S31+S33;\n"
" \n"
// Output addressing: out_offset selects row 4*dstXOrigin and column dstYOrigin
// (the linear tile index) of plane 0. The four lanes of each FLOAT4 result are
// written to four consecutive rows, dstHeightPad elements apart. batch_offset
// is the stride between planes (despite its name it does not involve batch).
" //NC4HW4 [alpha*alpha,srcChannelPad,dstHeightPad]\n"
" //index: [0,dstXOrigin,dstY,dstYOrigin % 4]\n"
" int out_offset=(0*srcChannelPad+4*dstXOrigin)*dstHeightPad+dstYOrigin;\n"
" int batch_offset=srcChannelPad*dstHeightPad;\n"
" \n"
// Second pass, along the column index k of m<k><j>, using the same four
// combinations as the first pass. The results are emitted in plane order
// 4*j + i, where j is the row index of m and i is the combination number:
//   i=0: m0j - m2j, i=1: 0.5*m1j + 0.5*m2j, i=2: -0.5*m1j + 0.5*m2j, i=3: -m1j + m3j.
// out_offset is advanced by one plane before every group after the first.
" FLOAT4 res=(+m00-m20);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" out_offset += batch_offset;\n"
" res=(+(FLOAT)0.5f*m10+(FLOAT)0.5f*m20);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(-(FLOAT)0.5f*m10+(FLOAT)0.5f*m20);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(-m10+m30);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" \n"
// Planes 4..7: same combinations applied to m<k>1.
" out_offset += batch_offset;\n"
" res=(+m01-m21);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(+(FLOAT)0.5f*m11+(FLOAT)0.5f*m21);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(-(FLOAT)0.5f*m11+(FLOAT)0.5f*m21);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(-m11+m31);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
// Planes 8..11: same combinations applied to m<k>2.
" out_offset += batch_offset;\n"
" res=(+m02-m22);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(+(FLOAT)0.5f*m12+(FLOAT)0.5f*m22);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(-(FLOAT)0.5f*m12+(FLOAT)0.5f*m22);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(-m12+m32);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
// Planes 12..15: same combinations applied to m<k>3.
" out_offset += batch_offset;\n"
" res=(+m03-m23);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(+(FLOAT)0.5f*m13+(FLOAT)0.5f*m23);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(-(FLOAT)0.5f*m13+(FLOAT)0.5f*m23);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" \n"
" out_offset += batch_offset;\n"
" res=(-m13+m33);\n"
" uOutput[out_offset]=res.x;\n"
" uOutput[out_offset+dstHeightPad]=res.y;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad]=res.z;\n"
" uOutput[out_offset+dstHeightPad+dstHeightPad+dstHeightPad]=res.w;\n"
" }\n"
"}\n"
// Kernel winoTransDstBuf2_3_1
// One work-item per (tile, channel block): global id 0 is the linear tile
// index, global id 1 selects the channel block (taken modulo dstChannelC4).
// Each work-item reads 16 FLOAT4 values, one from each of 16 input planes,
// reduces them to a 2x2 block of FLOAT4 outputs, adds the bias, optionally
// applies RELU or RELU6, and stores the pixels that lie inside the image.
// Unlike the source-transform kernel, the only range guard here is
// UNIFORM_BOUNDRY_CHECK. The unitHeight parameter is not referenced in the body.
"__kernel void winoTransDstBuf2_3_1(GLOBAL_SIZE_DIM2\n"
" __global const FLOAT* uInput,\n"
" __global const FLOAT* uBias,\n"
" __global FLOAT* uOutput,\n"
" __private const int unitWidth,//wUnit\n"
" __private const int unitHeight,//hUnit\n"
" __private const int dstWidth,\n"
" __private const int dstHeight,\n"
" __private const int dstChannelC4,\n"
" __private const int srcWidthPad,\n"
" __private const int dstChannelPad,\n"
" __private const int batch,\n"
" __private const int batchOffset) {\n"
" int2 pos=(int2)(get_global_id(0),get_global_id(1));\n"
" UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n"
// Split the linear tile index into (column, row) within the tile grid;
// dstXOrigin recombines them into the same linear index.
" int unitWidth_idx=pos.x % unitWidth;\n"
" int unitHeight_idx=pos.x/unitWidth;\n"
" int2 realPos=(int2)(unitWidth_idx,unitHeight_idx);\n"
" int dstXOrigin=unitWidth*unitHeight_idx+unitWidth_idx;\n"
" int oz=pos.y % dstChannelC4;\n"
" \n"
// Four bias values for this channel block, read from uBias at element 4*oz.
" FLOAT4 bias=vload4(0,uBias+oz*4);\n"
// The value computed for batchIndex here is never used: it is overwritten with
// batchOffset on the next line, so the batch is selected by the caller.
" int batchIndex=pos.y/dstChannelC4;\n"
" batchIndex=batchOffset;\n"
" {\n"
// Top-left output pixel of this tile; every tile covers a 2x2 output block.
" int oyStart=realPos.y*2;\n"
" int oxStart=realPos.x*2;\n"
" \n"
// inp_offset addresses row dstXOrigin, column 4*oz of plane 0; b_offset is the
// stride between two consecutive planes.
" // [alpha2,srcWidthPad,dstChannelPad]\n"
" //index: [0,dstXOrigin,4*oz]\n"
" const int inp_offset=(0*srcWidthPad+dstXOrigin)*dstChannelPad+4*oz;\n"
" const int b_offset=dstChannelPad*srcWidthPad;\n"
// S<k><r> is loaded from plane 4*r + k.
" FLOAT4 S00=vload4(0,uInput+inp_offset+b_offset*0);\n"
" FLOAT4 S10=vload4(0,uInput+inp_offset+b_offset*1);\n"
" FLOAT4 S20=vload4(0,uInput+inp_offset+b_offset*2);\n"
" FLOAT4 S30=vload4(0,uInput+inp_offset+b_offset*3);\n"
" FLOAT4 S01=vload4(0,uInput+inp_offset+b_offset*4);\n"
" FLOAT4 S11=vload4(0,uInput+inp_offset+b_offset*5);\n"
" FLOAT4 S21=vload4(0,uInput+inp_offset+b_offset*6);\n"
" FLOAT4 S31=vload4(0,uInput+inp_offset+b_offset*7);\n"
" FLOAT4 S02=vload4(0,uInput+inp_offset+b_offset*8);\n"
" FLOAT4 S12=vload4(0,uInput+inp_offset+b_offset*9);\n"
" FLOAT4 S22=vload4(0,uInput+inp_offset+b_offset*10);\n"
" FLOAT4 S32=vload4(0,uInput+inp_offset+b_offset*11);\n"
" FLOAT4 S03=vload4(0,uInput+inp_offset+b_offset*12);\n"
" FLOAT4 S13=vload4(0,uInput+inp_offset+b_offset*13);\n"
" FLOAT4 S23=vload4(0,uInput+inp_offset+b_offset*14);\n"
" FLOAT4 S33=vload4(0,uInput+inp_offset+b_offset*15);\n"
// First pass, along the row index r (column k kept fixed):
//   m<k>0 = S<k>0 + S<k>1 + S<k>2
//   m<k>1 = S<k>1 - S<k>2 + S<k>3
" FLOAT4 m00=+S00+S01+S02;\n"
" FLOAT4 m10=+S10+S11+S12;\n"
" FLOAT4 m20=+S20+S21+S22;\n"
" FLOAT4 m30=+S30+S31+S32;\n"
" FLOAT4 m01=+S01-S02+S03;\n"
" FLOAT4 m11=+S11-S12+S13;\n"
" FLOAT4 m21=+S21-S22+S23;\n"
" FLOAT4 m31=+S31-S32+S33;\n"
" \n"
// Element offset of the tile origin in uOutput. The index nests as
// (oz, batchIndex, y, x) with 4 values per pixel.
" //NC4HW4 [batch,dstChannelC4,dstHeight,dstWidth]\n"
" //index: [batchIndex,oz,oyStart,oxStart]\n"
" int out_offset=(((batchIndex+oz*batch)*dstHeight+oyStart)*dstWidth+oxStart)*4;\n"
// Second pass, along the column index k, fused with bias and activation.
// Each of the four blocks below handles one pixel of the 2x2 output tile and
// stores it only when the pixel is inside dstWidth x dstHeight, so partial
// tiles at the right and bottom edges are clipped. RELU and RELU6 are
// compile-time switches of the OpenCL program.
// Pixel (ox+0, oy+0): m00 + m10 + m20.
" {\n"
" int ox=oxStart+0;\n"
" int oy=oyStart+0;\n"
" if (ox<dstWidth && oy<dstHeight) {\n"
" FLOAT4 res=bias+m00+m10+m20;\n"
"#ifdef RELU\n"
" res=max(res,(FLOAT4)(0));\n"
"#endif\n"
"#ifdef RELU6\n"
" res=clamp(res,(FLOAT4)(0),(FLOAT4)(6));\n"
"#endif\n"
" vstore4(res,0,uOutput+out_offset);\n"
" }\n"
" }\n"
// Pixel (ox+1, oy+0): m10 - m20 + m30, stored one pixel (4 elements) to the right.
" {\n"
" int ox=oxStart+1;\n"
" int oy=oyStart+0;\n"
" if (ox<dstWidth && oy<dstHeight) {\n"
" FLOAT4 res=bias+m10-m20+m30;\n"
"#ifdef RELU\n"
" res=max(res,(FLOAT4)(0));\n"
"#endif\n"
"#ifdef RELU6\n"
" res=clamp(res,(FLOAT4)(0),(FLOAT4)(6));\n"
"#endif\n"
" vstore4(res,0,uOutput+out_offset+4);\n"
" }\n"
" }\n"
// Pixel (ox+0, oy+1): m01 + m11 + m21, stored one row (4*dstWidth elements) down.
" {\n"
" int ox=oxStart+0;\n"
" int oy=oyStart+1;\n"
" if (ox<dstWidth && oy<dstHeight) {\n"
" FLOAT4 res=bias+m01+m11+m21;\n"
"#ifdef RELU\n"
" res=max(res,(FLOAT4)(0));\n"
"#endif\n"
"#ifdef RELU6\n"
" res=clamp(res,(FLOAT4)(0),(FLOAT4)(6));\n"
"#endif\n"
" vstore4(res,0,uOutput+out_offset+4*dstWidth);\n"
" }\n"
" }\n"
// Pixel (ox+1, oy+1): m11 - m21 + m31, stored one row down and one pixel right.
" {\n"
" int ox=oxStart+1;\n"
" int oy=oyStart+1;\n"
" if (ox<dstWidth && oy<dstHeight) {\n"
" FLOAT4 res=bias+m11-m21+m31;\n"
"#ifdef RELU\n"
" res=max(res,(FLOAT4)(0));\n"
"#endif\n"
"#ifdef RELU6\n"
" res=clamp(res,(FLOAT4)(0),(FLOAT4)(6));\n"
"#endif\n"
" vstore4(res,0,uOutput+out_offset+4*dstWidth+4);\n"
" }\n"
" }\n"
" }\n"
"}\n"
// End of the concatenated program string, of the MNN_OPENCL_BUFFER_CLOSED
// guard, and of namespace MNN.
;
#endif
}
